{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### This script transforms the data by CDC shared with restricted access\n",
    "https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Restricted-Access-Detai/mbd7-r32t"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "def convert_age_group(x):\n",
    "    if x == \"0 - 9 Years\":\n",
    "        return \"0 - 9 Years\"\n",
    "    if x == \"10 - 19 Years\":\n",
    "        return \"10 - 19 Years\"\n",
    "    if x == \"Missing\" or x ==\"NA\":\n",
    "        return \"Missing\"\n",
    "    return \"20+\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "data_1 = pd.read_parquet(\"./data/cdc_restricted_data/COVID_Cases_Restricted_Details_03312021_Part_1.parquet\",\n",
    "                        columns =[\"county_fips_code\", \"cdc_case_earliest_dt\", \"cdc_report_dt\",\"current_status\", \"age_group\"])\n",
    "data_2 = pd.read_parquet(\"./data/cdc_restricted_data/COVID_Cases_Restricted_Details_03312021_Part_2.parquet\",\n",
    "                        columns =[\"county_fips_code\", \"cdc_case_earliest_dt\", \"cdc_report_dt\",\"current_status\", \"age_group\"])\n",
    "data_3 = pd.read_parquet(\"./data/cdc_restricted_data/COVID_Cases_Restricted_Details_03312021_Part_3.parquet\",\n",
    "                        columns =[\"county_fips_code\", \"cdc_case_earliest_dt\", \"cdc_report_dt\",\"current_status\", \"age_group\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data_1.append(data_2).append(data_3).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"dt\"] = data.cdc_case_earliest_dt.combine_first(data.cdc_report_dt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "data[\"cnt\"]=1\n",
    "data[\"age_group\"]=data.age_group.map(convert_age_group)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_pivoted = pd.concat([data, data.pivot(columns=[\"current_status\"], values=[\"cnt\"])], axis = 1).fillna(0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/hoktay/.pyenv/versions/3.8.0/envs/agent_based_seir/lib/python3.8/site-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n",
      "  return array(a, dtype, copy=False, order=order)\n"
     ]
    }
   ],
   "source": [
    "data_pivoted = data_pivoted[[\"county_fips_code\", \"dt\", \"age_group\", ('cnt', 'Laboratory-confirmed case'), ('cnt', 'Probable Case')]]\n",
    "data_pivoted.columns = [\"county_fips_code\", \"dt\", \"age_group\",  'Laboratory-confirmed case', 'Probable Case']\n",
    "daily_data = data_pivoted[[\"dt\", \"county_fips_code\", \"Laboratory-confirmed case\", \"Probable Case\"]].groupby([\"dt\",\"county_fips_code\"] ).sum().reset_index()\n",
    "daily_age_group_data = data_pivoted[[\"dt\", \"county_fips_code\", \"age_group\",  \"Laboratory-confirmed case\", \"Probable Case\"]].groupby([\"dt\",\"county_fips_code\", \"age_group\"] ).sum().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "population_data = pd.read_csv(\"./data/burbio/county_population.csv\")[[\"Fips code\", \"POPESTIMATE2019\"]].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_age_group_data[\"county_fips_code\"] = daily_age_group_data.county_fips_code.replace('NA', None).astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_age_group_data = pd.merge(daily_age_group_data, population_data, left_on=\"county_fips_code\", right_on=\"Fips code\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_age_group_data[\"lab_confirmed_cases_per_pop\"] = daily_age_group_data[\"Laboratory-confirmed case\"]*1000.0/daily_age_group_data[\"POPESTIMATE2019\"]\n",
    "daily_age_group_data[\"all_cases_per_pop\"] = (daily_age_group_data[\"Laboratory-confirmed case\"] + daily_age_group_data[\"Probable Case\"])*1000.0/daily_age_group_data[\"POPESTIMATE2019\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "daily_age_group_data.dt = pd.to_datetime(daily_age_group_data.dt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data = daily_age_group_data[[\"age_group\", \"lab_confirmed_cases_per_pop\", \"all_cases_per_pop\"] + [\"county_fips_code\", \"dt\"]].groupby([\"county_fips_code\", \"age_group\"]).resample('W-Wed', label=\"right\", closed=\"right\", on=\"dt\").mean().sort_values(by=\"dt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "weekly_data[[\"lab_confirmed_cases_per_pop\", \"all_cases_per_pop\"]].reset_index().to_csv(\"./data/weekly_cases_per_1k_with_age_groups.csv\", header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
